#settings
library(ggplot2)
library(gridExtra)
library(MASS)
#load data
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
data("Default")
default_table = table(Default$default)
default_table
##
## No Yes
## 9667 333
ans) Labels in the data are biased.
barplot(default_table,
main = "default histogram", # 히스토그램 제목
xlab = "Default Status", # x축 레이블
ylab = "Frequency", # y축 레이블
col = c("skyblue", "orange"), # 막대 색상 지정
border = "black" # 막대 테두리 색상 지정
)
ans) The samples are biased to one side.
library(ggplot2)
# Scatter plot : balance ~ income
ggplot(Default, aes(x = balance, y = income, color = default)) +
geom_point(alpha = 0.6) +
labs(x = "balance", y = "income", color = "Default Status")
# box plot : default ~ balance
ggplot(Default, aes(x = default, y = balance)) +
geom_boxplot() +
labs(x = "default", y = "balance")
# box plot : default ~ income
ggplot(Default, aes(x = default, y = income)) +
geom_boxplot() +
labs(x = "default", y = "income")
#### additional(log scale).
Default$log_balance = log(Default$balance)
Default$log_income = log(Default$income)
# Scatter plot : log_balance ~ log_income
ggplot(Default, aes(x = log_balance, y = log_income, color = default)) +
geom_point(alpha = 0.6) +
labs(x = "log_balance", y = "log_income", color = "Default Status")
# box plot : default ~ log_balance
ggplot(Default, aes(x = default, y = log_balance)) +
geom_boxplot() +
labs(x = "default", y = "log_balance")
## Warning: Removed 499 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# box plot : default ~ log_income
ggplot(Default, aes(x = default, y = log_income)) +
geom_boxplot() +
labs(x = "default", y = "log_income")
ans)
Taking a log on the data doesn’t seem appropriate.
# LDA 수행
lda_model <- lda(default ~ balance + income, data = Default)
# Scatter plot 그리기
ggplot(Default, aes(x = balance, y = income, color = default)) +
geom_point(alpha = 0.6) +
geom_abline(slope = -lda_model$scaling[2] / lda_model$scaling[1], intercept = lda_model$scaling[3] / lda_model$scaling[1], linetype = "dashed", color = "blue") +
labs(x = "Balance", y = "Income", color = "Default") +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_abline()`).
pca_model <- prcomp(Default[, c("balance", "income")], scale = TRUE)
pca_data <- as.data.frame(pca_model$x)
ggplot(pca_data, aes(x = PC1, y = PC2, color = Default$default)) +
geom_point(alpha = 0.6) +
labs(x = "Principal Component 1 (PC1)", y = "Principal Component 2 (PC2)", color = "Default") +
theme_minimal()
ans)
Performing log, LDA, and PCA doesn’t seem significant.
set.seed(222)
model_1 = glm(default ~ income + balance, data = Default, family = "binomial")
new_data = data.frame(balance = seq(min(Default$balance), max(Default$balance), length.out = 100), income = seq(min(Default$income), max(Default$income), length.out = 100))
new_data$predicted_prob = predict(model_1, newdata = new_data, type = "response")
#binary mapping
Default$default_binary = ifelse(Default$default == "No", 0, 1)
ggplot(new_data, aes(x = balance, y = predicted_prob)) +
geom_point(data = Default, aes(x = balance, y = default_binary, color = default), alpha = 0.5) +
geom_line(color = "blue") +
labs(x = "Credit Card Balance", y = "Probability of Default", title = "Logistic Regression Model(default ~ income + balance)") +
theme_minimal()
summary(model_1)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
ans) Although both the income and the balance are statistically significant, it can be seen that the balance has a larger z-value than the income. and coefficient of balance is much bigger than income. Therefore, the balance is more significant in estimating default.
set.seed(222)
model_2 = glm(default ~ balance, data = Default, family = "binomial")
new_data = data.frame(balance = seq(min(Default$balance), max(Default$balance), length.out = 100))
new_data$predicted_prob = predict(model_2, newdata = new_data, type = "response")
#binary mapping
Default$default_binary = ifelse(Default$default == "No", 0, 1)
#draw
ggplot(new_data, aes(x = balance, y = predicted_prob)) +
geom_point(data = Default, aes(x = balance, y = default_binary, color = default), alpha = 0.5) +
geom_line(color = "blue") +
labs(x = "Credit Card Balance", y = "Probability of Default", title = "Logistic Regression Model(default ~ income + balance)") +
theme_minimal()
summary(model_2)
##
## Call:
## glm(formula = default ~ balance, family = "binomial", data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
I divided the data into two part, training set and validation
set.
(training set : validation set = 7 : 3)
set.seed(222)
train_idx = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data = Default[train_idx, ]
validation_data = Default[-train_idx, ]
model_3 = glm(default ~ balance + income, data = train_data, family = "binomial")
valid_pred = predict(model_3, newdata= validation_data, type = "response")
valid_pred_class = ifelse(valid_pred > 0.5, 1, 0)
ans) If the softmax value is higher than 0.5, it is classified as “Up”.
actual_class = validation_data$default_binary
confusion_matrix = table(actual = actual_class, predicted = valid_pred_class)
confusion_matrix
## predicted
## actual 0 1
## 0 2885 12
## 1 67 36
TN = 2885
FP = 12
FN = 67
TP = 36
BCE = -mean(actual_class * log(valid_pred) + (1 - actual_class) * log(1 - valid_pred))
accuracy = (TP + TN)/(TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = (TN) / (TN + FP)
validation_set_error = (FP + FN) / (TP + FP + TN + FN)
f1 = 2 * (precision * recall)/(precision + recall)
matrics = data.frame(
Matric = c("BCE", "accuracy","precision", "specificity", "recall", "f1-score", "validation_set_error"),
Value = c(BCE, accuracy, precision, specificity, recall, f1, validation_set_error)
)
print(matrics)
## Matric Value
## 1 BCE 0.08319640
## 2 accuracy 0.97366667
## 3 precision 0.75000000
## 4 specificity 0.99585778
## 5 recall 0.34951456
## 6 f1-score 0.47682119
## 7 validation_set_error 0.02633333
ans) In my opinion, the accuracy value was high because there are lots of “No” labeled data than the “Yes” label. additionally, recall value was lower than others.
#First
set.seed(223)
train_idx_first = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_first = Default[train_idx_first, ]
validation_data_first = Default[-train_idx_first, ]
#Second
set.seed(224)
train_idx_second = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_second = Default[train_idx_second, ]
validation_data_second = Default[-train_idx_second, ]
#Third
set.seed(225)
train_idx_third = sample(1:nrow(Default), 0.7 * nrow(Default))
train_data_third = Default[train_idx_third, ]
validation_data_third = Default[-train_idx_third, ]
#First
model_first = glm(default ~ balance + income, data = train_data_first, family = "binomial")
#Second
model_second = glm(default ~ balance + income, data = train_data_second, family = "binomial")
#Third
model_thrid = glm(default ~ balance + income, data = train_data_third, family = "binomial")
#First
valid_pred_first = predict(model_first, newdata= validation_data, type = "response")
valid_pred_class_frist = ifelse(valid_pred_first > 0.5, 1, 0)
#Second
valid_pred_second = predict(model_second, newdata= validation_data, type = "response")
valid_pred_class_second = ifelse(valid_pred_second > 0.5, 1, 0)
#Third
valid_pred_third = predict(model_thrid, newdata= validation_data, type = "response")
valid_pred_class_third = ifelse(valid_pred_third > 0.5, 1, 0)
#First
actual_class_first = validation_data_first$default_binary
confusion_matrix_first = table(actual = actual_class_first, predicted = valid_pred_class_frist)
#Second
actual_class_second = validation_data_second$default_binary
confusion_matrix_second = table(actual = actual_class_second, predicted = valid_pred_class_second)
#Third
actual_class_third = validation_data_third$default_binary
confusion_matrix_third = table(actual = actual_class_third, predicted = valid_pred_class_third)
#First
confusion_matrix_first
## predicted
## actual 0 1
## 0 2849 46
## 1 105 0
ans) It was confirmed that the case where Default is “yes” was not classified well.
TN_first = 2849
FP_first = 46
FN_first = 105
TP_first = 0
BCE_first = -mean(actual_class_first * log(valid_pred_first) + (1 - actual_class_first) * log(1 - valid_pred_first))
accuracy_first = (TP_first + TN_first)/(TP_first + FP_first + TN_first + FN_first)
precision_first = TP_first / (TP_first + FP_first)
recall_first = TP / (TP_first + FN_first)
f1_first = 2 * (precision_first * recall_first)/(precision_first + recall_first)
specificity_first = (TN_first) / (TN_first + FP_first)
validation_set_error_first = (FP_first + FN_first) / (TP_first + FP_first + TN_first + FN_first)
confusion_matrix_second
## predicted
## actual 0 1
## 0 2851 46
## 1 102 1
ans) It was confirmed that the case where Default is “yes” was not classified well.
TN_second = 2851
FP_second = 46
FN_second = 102
TP_second = 1
BCE_second = -mean(actual_class_second * log(valid_pred_second) + (1 - actual_class_second) * log(1 - valid_pred_second))
accuracy_second = (TP_second + TN_second)/(TP_second + FP_second + TN_second + FN_second)
precision_second = TP_second / (TP_second + FP_second)
recall_second = TP_second / (TP_second + FN_second)
f1_second = 2 * (precision_second * recall_second)/(precision_second + recall_second)
specificity_second = (TN_second) / (TN_second + FP_second)
validation_set_error_second = (FP_second + FN_second) / (TP_second + FP_second + TN_second + FN_second)
confusion_matrix_third
## predicted
## actual 0 1
## 0 2856 47
## 1 97 0
ans) It was confirmed that the case where Default is “yes” was not classified well.
TN_third = 2856
FP_third = 47
FN_third = 97
TP_third = 0
BCE_third = -mean(actual_class_third * log(valid_pred_third) + (1 - actual_class_third) * log(1 - valid_pred_third))
accuracy_third = (TP_third + TN_third)/(TP_third + FP_third + TN_third + FN_third)
precision_third = TP_third / (TP_third + FP_third)
recall_third = TP_third / (TP_third + FN_third)
f1_third = 2 * (precision_third * recall_third)/(precision_third + recall_third)
specificity_third = (TN_third) / (TN_third + FP_third)
validation_set_error_third = (FP_third + FN_third) / (TP_third + FP_third + TN_third + FN_third)
matrics_three_times = data.frame(
Matric = c("BCE", "accuracy", "precision", "specificity","recall", "f1-score", validation_set_error),
first = c(BCE_first, accuracy_first, precision_first, specificity_first, recall_first, f1_first, validation_set_error_first),
second = c(BCE_second, accuracy_second, precision_second, specificity_second,recall_second, f1_second, validation_set_error_second),
third = c(BCE_third, accuracy_third, precision_third, specificity_third,recall_third, f1_third, validation_set_error_third)
)
print(matrics_three_times)
## Matric first second third
## 1 BCE 0.25866531 0.245639483 0.2455566
## 2 accuracy 0.94966667 0.950666667 0.9520000
## 3 precision 0.00000000 0.021276596 0.0000000
## 4 specificity 0.98411054 0.984121505 0.9838099
## 5 recall 0.34285714 0.009708738 0.0000000
## 6 f1-score 0.00000000 0.013333333 NaN
## 7 0.0263333333333333 0.05033333 0.049333333 0.0480000
ans)For all cases, all Matrics have low recall values. because TP value had 0 about most of cases, other Matrics values(precision, f1-score…) was not clear.
library(boot)
data("Weekly")
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
ans)Only Lag2 seems statistically significant.
weekly_subset = Weekly[-1, ]
model_weekly = glm(Direction ~ Lag1 + Lag2, data = weekly_subset, family="binomial")
summary(model_weekly)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = weekly_subset)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
ans) It has a coefficient similar to that before. Lag2(0.06025 -> 0.06085)
valid_pred_weekly = predict(model_weekly, newdata= Weekly[1, ], type = "response")
valid_pred_class_weekly = ifelse(valid_pred_weekly > 0.5, "Up", "Down")
real = as.character((Weekly[1, ]$Direction))
est = as.character(valid_pred_class_weekly)
print("*****<real>*******")
## [1] "*****<real>*******"
print(real)
## [1] "Down"
print("*****<est>*******")
## [1] "*****<est>*******"
print(est)
## [1] "Up"
set.seed(20)
errors = numeric(nrow(Weekly))
n = nrow(Weekly)
for (i in 1:n) {
model = glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family="binomial")
prediction = predict(model, newdata = Weekly[i, ], type="response")
actual = ifelse(Weekly[i, "Direction"] == "Up", 1, 0)
#binary cross-entropy
errors[i] = -actual * log(prediction) - (1 - actual) * log(1 - prediction)
}
errors_mean = mean(errors)
plot(errors, type = "l", xlab = "ith", ylab = "Binary Cross Entropy", main = "LOOCV Errors")
print(tail(errors))
## [1] 0.7404976 0.5736674 0.6686027 0.5349879 0.5605849 0.5998062
print(errors_mean)
## [1] 0.6860653
prob = rep(0, nrow(Weekly))
for (i in 1:nrow(Weekly)) {
prob[i] = predict(model, newdata=Weekly[i, ], type="response")
}
print(mean(prob))
## [1] 0.5551413
ans) The probability prediction seems to be close to Up.
plot(prob, type = "l", xlab = "ith", ylab = "probability", main = "ith observation")
##### iii. Use the posterior probability for the ith observation in
order to predict whether or not the market moves up.
Weekly$binary_direction = ifelse(Weekly$Direction == "Up", 1, 0)
prob_class = ifelse(prob > 0.5, 1, 0)
print(length(Weekly$binary_direction))
## [1] 1089
print(length(prob_class))
## [1] 1089
confusion_matrix_loocv = table(Weekly$binary_direction, prob_class)
print(confusion_matrix_loocv)
## prob_class
## 0 1
## 0 39 445
## 1 39 566
TN = 39
FP = 445
FN = 39
TP = 566
BCE = -mean(Weekly$binary_direction * log(prob) + (1 - Weekly$binary_direction) * log(1 - prob))
accuracy = (TP + TN)/(TP + FP + TN + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
specificity = (TN) / (TN + FP)
validation_set_error = (FP + FN) / (TP + FP + TN + FN)
f1 = 2 * (precision * recall)/(precision + recall)
matrics_loocv = data.frame(
Matric = c("BCE", "accuracy", "precision", "specificity","recall", "f1-score", "validation_set_error"),
Value = c(BCE, accuracy, precision, specificity, recall, f1, validation_set_error)
)
print(matrics_loocv)
## Matric Value
## 1 BCE 0.68329716
## 2 accuracy 0.55555556
## 3 precision 0.55984174
## 4 specificity 0.08057851
## 5 recall 0.93553719
## 6 f1-score 0.70049505
## 7 validation_set_error 0.44444444
ans) It shows a higher recall value than the default data set. This seems to be because the snp500 index has been steadily rising from 1990 to 2010. Other indicators are difficult to see as significant.